{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!/usr/bin/env python\n",
    "# -*- coding: utf-8 -*-\n",
    "\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn import svm\n",
    "\n",
    "from loglizer.models import PCA, IsolationForest, LogClustering, OneClassSVM\n",
    "from loglizer import dataloader, preprocessing\n",
    "from loglizer.utils import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/haixuanguo/Documents/deeplog_copy_github/HDFS/../loglizer/dataloader.py:286: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
      "  train = np.array(train).reshape(-1,1)\n",
      "/home/haixuanguo/Documents/deeplog_copy_github/HDFS/../loglizer/dataloader.py:292: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
      "  test_normal = np.array(test_normal).reshape(-1,1)\n",
      "/home/haixuanguo/Documents/deeplog_copy_github/HDFS/../loglizer/dataloader.py:298: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
      "  abnormal = np.array(abnormal).reshape(-1,1)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train normal size: 5000\n",
      "Train abnormal size: 2500\n",
      "Test normal size: 553223\n",
      "Test abnormal size: 14338\n",
      "====== Transformed train data summary ======\n",
      "Train data shape: 7500-by-40\n",
      "\n",
      "====== Transformed test data summary ======\n",
      "Test data shape: 567561-by-40\n",
      "\n"
     ]
    }
   ],
   "source": [
    "ouput_dir = \"../output/hdfs/\"\n",
    "(x_train, y_train), (x_test, y_test) = dataloader.load_data(data_dir=ouput_dir)\n",
    "feature_extractor = preprocessing.FeatureExtractor()\n",
    "x_train = feature_extractor.fit_transform(x_train)\n",
    "x_test = feature_extractor.transform(x_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================== Model: PCA ====================\n",
      "theshold 0\n",
      "====== Model summary ======\n",
      "n_components: 2\n",
      "Project matrix shape: 40-by-40\n",
      "SPE threshold: 1\n",
      "\n",
      "Train validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 2500, FP: 2098, TN: 2902, FN: 0\n",
      "Precision: 54.371%, recall: 100.000%, F1-measure: 70.442%\n",
      "\n",
      "Test validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 14338, FP: 230566, TN: 322657, FN: 0\n",
      "Precision: 5.854%, recall: 100.000%, F1-measure: 11.062%\n",
      "\n",
      "CPU times: user 3.4 s, sys: 79.5 ms, total: 3.48 s\n",
      "Wall time: 1.75 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"=\"*20 + \" Model: PCA \" + \"=\"*20)\n",
    "for th in np.arange(1):\n",
    "    print(\"theshold\", th)\n",
    "    model = PCA(n_components=0.8, threshold=1, c_alpha = 1.9600)\n",
    "    model.fit(x_train)\n",
    "    print('Train validation:')\n",
    "    precision, recall, f1 = model.evaluate(x_train, y_train)\n",
    "    print('Test validation:')\n",
    "    precision, recall, f1 = model.evaluate(x_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================== Model: IsolationForest ====================\n",
      "====== Model summary ======\n",
      "Train validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 1718, FP: 26, TN: 4974, FN: 782\n",
      "Precision: 98.509, recall: 68.720, F1-measure: 80.961\n",
      "\n",
      "Test validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 9786, FP: 2570, TN: 550653, FN: 4552\n",
      "Precision: 79.200, recall: 68.252, F1-measure: 73.320\n",
      "\n",
      "CPU times: user 15.5 s, sys: 3.4 s, total: 18.9 s\n",
      "Wall time: 18.8 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"=\"*20 + \" Model: IsolationForest \" + \"=\"*20)\n",
    "model = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=19)\n",
    "model.fit(x_train)\n",
    "print('Train validation:')\n",
    "precision, recall, f1 = model.evaluate(x_train, y_train)\n",
    "print('Test validation:')\n",
    "precision, recall, f1 = model.evaluate(x_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================== Model: one class SVM ====================\n",
      "====== Model summary ======\n",
      "Train validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 469, FP: 5000, TN: 0, FN: 2031\n",
      "Precision: 8.576, recall: 18.760, F1-measure: 11.771\n",
      "\n",
      "Test validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 2769, FP: 553223, TN: 0, FN: 11569\n",
      "Precision: 0.498, recall: 19.312, F1-measure: 0.971\n",
      "\n",
      "CPU times: user 1min 50s, sys: 51.9 ms, total: 1min 50s\n",
      "Wall time: 1min 50s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"=\"*20 + \" Model: one class SVM \" + \"=\"*20)\n",
    "model = OneClassSVM(kernel='rbf')\n",
    "model.fit(x_train, y_train)\n",
    "\n",
    "print('Train validation:')\n",
    "precision, recall, f1 = model.evaluate(x_train, y_train)\n",
    "print('Test validation:')\n",
    "precision, recall, f1 = model.evaluate(x_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================== Model: one class SVM ====================\n",
      "CPU times: user 2min 50s, sys: 3.56 s, total: 2min 54s\n",
      "Wall time: 2min 54s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, estimator=OneClassSVM(),\n",
       "             param_grid={'gamma': [0.001, 0.01, 0.1, 1],\n",
       "                         'kernel': ['rbf', 'poly', 'linear', 'sigmoid'],\n",
       "                         'nu': [0.001, 0.01, 0.1, 1]},\n",
       "             scoring='f1_micro')"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# %%time\n",
    "# print(\"=\"*20 + \" Model: one class SVM \" + \"=\"*20)\n",
    "\n",
    "# nus = [0.001, 0.01, 0.1, 1]\n",
    "# gammas = [0.001, 0.01, 0.1, 1]\n",
    "# tuned_parameters = {'kernel' : ['rbf','poly','linear','sigmoid'], 'gamma' : gammas, 'nu': nus}\n",
    "\n",
    "# ocsvm = svm.OneClassSVM()\n",
    "# model = GridSearchCV(ocsvm, tuned_parameters, cv=5, scoring=\"f1_micro\")\n",
    "\n",
    "# model.fit(x_train, y_train.astype(int))\n",
    "\n",
    "# # print('Train validation:')\n",
    "# # precision, recall, f1 = model.predict(x_train, y_train.astype(int))\n",
    "# # print('Test validation:')\n",
    "# # precision, recall, f1 = model.predict(x_test, y_test.astype(int))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train validation:\n",
      "Confusion Matrix: TP: 1543, FP: 5000, TN: 0, FN: 957\n",
      "Precision: 23.582, recall: 61.720, F1-measure: 34.126\n",
      "\n",
      "Test validation:\n",
      "Confusion Matrix: TP: 9114, FP: 553223, TN: 0, FN: 5224\n",
      "Precision: 1.621, recall: 63.565, F1-measure: 3.161\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# print('Train validation:')\n",
    "# y_eval = model.predict(x_train)\n",
    "# precision, recall, f1 = metrics(y_eval, y_train)\n",
    "# print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\\n'.format(precision, recall, f1))\n",
    "    \n",
    "# print('Test validation:')\n",
    "# y_pred = model.predict(x_test)\n",
    "# precision, recall, f1 = metrics(y_pred, y_test)\n",
    "# print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\\n'.format(precision, recall, f1))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================== Model: LogClustering ====================\n",
      "====== Model summary ======\n",
      "Starting offline clustering...\n",
      "Processed 1000 instances.\n",
      "Found 4 clusters offline.\n",
      "\n",
      "Starting online clustering...\n",
      "Processed 2000 instances.\n",
      "Processed 4000 instances.\n",
      "Processed 5000 instances.\n",
      "Found 4 clusters online.\n",
      "\n",
      "Train validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 960, FP: 0, TN: 5000, FN: 1540\n",
      "Precision: 100.000, recall: 38.400, F1-measure: 55.491\n",
      "\n",
      "Test validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 5251, FP: 40, TN: 553183, FN: 9087\n",
      "Precision: 99.244, recall: 36.623, F1-measure: 53.502\n",
      "\n",
      "CPU times: user 26.9 s, sys: 4.13 ms, total: 27 s\n",
      "Wall time: 26.9 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"=\"*20 + \" Model: LogClustering \" + \"=\"*20)\n",
    "max_dist = 0.3  # the threshold to stop the clustering process\n",
    "anomaly_threshold = 0.3  # the threshold for anomaly detection\n",
    "model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)\n",
    "model.fit(x_train[y_train == 0, :])  # Use only normal samples for training\n",
    "print('Train validation:')\n",
    "precision, recall, f1 = model.evaluate(x_train, y_train)\n",
    "print('Test validation:')\n",
    "precision, recall, f1 = model.evaluate(x_test, y_test)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
